
#' Generic tokenizer: dispatches to a method chosen by the class of `x`.
#'
#' @param x Object to tokenize; its class selects the method.
#' @param ... Further arguments handed on to the selected method.
tokenize <- function(x, ...) {
  UseMethod("tokenize")
}

#' Fallback method: always signals an error naming the first class of `x`.
#'
#' @param x Object for which no specific `tokenize` method was selected.
#' @param ... Ignored.
tokenize.default <- function(x, ...) {
  first_class <- class(x)[1]
  stop("`tokenize` not implemented for object of class ", sQuote(first_class))
}
#' Tokenize a corpus into (optionally stemmed and lowercased) n-grams
#'
#' Splits `x` into word tokens with `tokens()` (removing punctuation, numbers,
#' symbols and separators, and splitting hyphenated words), then optionally
#' removes stopwords, stems and lowercases the tokens, and finally builds
#' n-grams with `tokens_ngrams()`.
#'
#' @param x Object handed to `tokens()` (this is the `corpus` method).
#' @param lang Single string passed as `language` to `tokens_wordstem()`.
#'   Only used (and only validated) when `stem` is `TRUE`.
#' @param stopwords `NULL` (no stopword removal) or a character vector without
#'   `NA`s, passed as `pattern` to `tokens_remove()`.
#' @param stem Single `TRUE`/`FALSE`: stem the tokens?
#' @param tolower Single `TRUE`/`FALSE`: lowercase the tokens?
#' @param ngrams Non-empty vector of positive whole numbers giving the n-gram
#'   sizes. Whole-number doubles (e.g. `2` or `c(1, 2)`) are accepted and
#'   coerced to integer.
#' @param .verbose Passed as `verbose` to `tokens()`; when true, a progress
#'   message is emitted before each optional processing step.
#' @return The result of `tokens_ngrams()` applied to the processed tokens.
tokenize.corpus <- function(
  x,
  lang = "de",
  stopwords = NULL,
  stem = TRUE,
  tolower = TRUE,
  ngrams = 1:2,
  .verbose = quanteda_options("verbose")
) {
  # A "flag" is exactly one non-missing logical value. Checking the length
  # first keeps `&&` from seeing a longer vector.
  is_flag <- function(v) is.logical(v) && length(v) == 1L && !is.na(v)

  # stopifnot() evaluates its conditions in order and stops at the first
  # failure, so the `lang` check may safely rely on `stem` being a valid flag.
  # TODO: `lang` could additionally be checked against
  # SnowballC::getStemLanguages().
  stopifnot(
    "`stopwords` must be NULL or a character vector without NAs" =
      is.null(stopwords) || (is.character(stopwords) && !anyNA(stopwords)),
    "`stem` must be a single TRUE or FALSE" = is_flag(stem),
    "`tolower` must be a single TRUE or FALSE" = is_flag(tolower),
    "`lang` must be a single non-NA string when `stem = TRUE`" =
      !stem || (is.character(lang) && length(lang) == 1L && !is.na(lang)),
    "`ngrams` must be a non-empty vector of positive integers" =
      is.numeric(ngrams) && length(ngrams) > 0L &&
        all(is.finite(ngrams)) && all(ngrams > 0) &&
        all(ngrams == trunc(ngrams))
  )
  # Accept whole-number doubles such as `2`, but work with integers downstream.
  ngrams <- as.integer(ngrams)

  toks <- tokens(
    x,
    what = "word",
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE,
    remove_separators = TRUE,
    split_hyphens = TRUE,
    padding = FALSE,
    verbose = .verbose
  )

  # length() is 0 for both NULL and an empty vector: nothing to remove then.
  if (length(stopwords) > 0L) {
    if (.verbose) {
      message("removing stopwords")
    }
    toks <- tokens_remove(toks, pattern = stopwords)
  }

  # NOTE(review): stemming runs before lowercasing; confirm the stemmer treats
  # capitalized tokens as intended for the chosen language.
  if (stem) {
    if (.verbose) {
      message("stemming")
    }
    toks <- tokens_wordstem(toks, language = lang)
  }

  if (tolower) {
    if (.verbose) {
      message("lowercasing")
    }
    toks <- tokens_tolower(toks)
  }

  tokens_ngrams(toks, n = ngrams)
}

#' Tokenize a data frame by first turning it into a corpus
#'
#' Builds a corpus from `x` with `corpus()` and hands it to
#' `tokenize.corpus()`.
#'
#' @param x Data frame passed to `corpus()`.
#' @param text.col Passed to `corpus()` as `text_field`.
#' @param id.col Passed to `corpus()` as `docid_field`.
#' @param ... Further arguments forwarded to `tokenize.corpus()`.
#' @return Whatever `tokenize.corpus()` returns.
tokenize.data.frame <- function(x, text.col, id.col, ...) {
  tokenize.corpus(
    x = corpus(x, text_field = text.col, docid_field = id.col),
    ...
  )
}
